Setting the Stage



In [1]:

    
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy as scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *



In [2]:

    
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,ggplot,matplotlib,mpld3,seaborn,bokeh -d -n -t -z -v -m -g









    



Rene Clausen Nielsen, UN Global Pulse 15/05/2015 18:53:16 UTC

CPython 2.7.9
IPython 3.1.0

pandas 0.16.1
numpy 1.9.2
scipy 0.15.1
geolocator 0.2.dev0
ggplot 0.6.5
matplotlib 1.4.3
mpld3 0.2
seaborn 0.5.1
bokeh 0.8.2

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   :



In [3]:

    
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
sns.set_context("poster")
sns.despine()
sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12, 
                        'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'axes.facecolor': ".97",
                        'grid.color': '.9', 'axes.edgecolor': '.9', 'font.family': ['sans-serif'], 'lines.solid_capstyle': 'round',
                        'font.sans-serif': ['Liberation Sans','Bitstream Vera Sans','sans-serif','Arial'],})









    




    
        
        
        
    
        
        BokehJS successfully loaded.
    






    





<matplotlib.figure.Figure at 0x7f653e689bd0>

Getting Population Data



In [4]:

    
populationData = pd.read_csv('../data/BrazilPopulation.csv', encoding='utf-8',)
populationData.sort("Population", ascending=False)









    Out[4]:






  
    
      
      City
      State
      Population
    
  
  
    
      24
      São Paulo
      SP
      11152968
    
    
      21
      Rio de Janeiro
      RJ
      6320446
    
    
      22
      Salvador
      BA
      2674923
    
    
      4
      Brasília
      DF
      2481272
    
    
      9
      Fortaleza
      CE
      2452185
    
    
      2
      Belo Horizonte
      MG
      2375151
    
    
      14
      Manaus
      AM
      1792881
    
    
      7
      Curitiba
      PR
      1751907
    
    
      19
      Recife
      PE
      1537704
    
    
      17
      Porto Alegre
      RS
      1409351
    
    
      1
      Belém
      PA
      1381475
    
    
      10
      Goiânia
      GO
      1297154
    
    
      23
      São Luís
      MA
      958545
    
    
      13
      Maceió
      AL
      932078
    
    
      15
      Natal
      RN
      803739
    
    
      5
      Campo Grande
      MS
      776242
    
    
      25
      Teresina
      PI
      767559
    
    
      11
      João Pessoa
      PB
      720954
    
    
      0
      Aracaju
      SE
      571149
    
    
      6
      Cuiabá
      MT
      540814
    
    
      8
      Florianópolis
      SC
      405189
    
    
      18
      Porto Velho
      RO
      392475
    
    
      12
      Macapá
      AP
      381091
    
    
      26
      Vitória
      ES
      327801
    
    
      20
      Rio Branco
      AC
      308545
    
    
      3
      Boa Vista
      RR
      277799
    
    
      16
      Palmas
      TO
      221742



In [5]:

    
populationData.sort("Population").plot(x = "City",
                   y = "Population",
                   kind = "barh", 
                   title = "Number of Inhabitants per City",
                   legend = False,
                   # colormap = cmap,
                   color = "#00aeef",
                   alpha=0.8)









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f653de85110>

Getting the Ground-Truth Data



In [6]:

    
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")



In [7]:

    
curitibaData.sheet_names









    Out[7]:





[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']



In [8]:

    
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)



In [9]:

    
curitibaPublic









    Out[9]:






  
    
      
      2014-01
      2014-02
      2014-03
      2014-04
      2014-05
      2014-06
      2014-07
      2014-08
      2014-09
      2014-10
      2014-11
      2014-12
      2015-01
      2015-02
    
    
      HIV tests performed
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Primary Health Care services (females)
      1465
      1425
      1378
      1477
      1437
      1174
      1495
      1267
      1438
      1413
      720
      686
      1352
      1277
    
    
      Primary Health Care services (males)
      1057
      1056
      998
      1115
      1064
      840
      1197
      1009
      1250
      1187
      577
      617
      1157
      912
    
    
      Pregnant women in public health care
      2760
      2360
      2095
      2258
      2473
      2086
      2640
      2020
      2571
      2499
      1564
      1666
      3027
      2130
    
    
      HIV testing center (males)
      528
      358
      389
      383
      348
      307
      490
      404
      377
      370
      405
      430
      379
      437
    
    
      HIV testing center (females)
      186
      133
      158
      155
      155
      94
      197
      158
      107
      117
      111
      156
      112
      144



In [10]:

    
# curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09'] # Including only months where we also have Twitter data 
# curitibaPublic

Turning the Table



In [11]:

    
curitibaPublic = curitibaPublic.transpose()
curitibaPublic









    Out[11]:






  
    
      HIV tests performed
      Primary Health Care services (females)
      Primary Health Care services (males)
      Pregnant women in public health care
      HIV testing center (males)
      HIV testing center (females)
    
  
  
    
      2014-01
      1465
      1057
      2760
      528
      186
    
    
      2014-02
      1425
      1056
      2360
      358
      133
    
    
      2014-03
      1378
      998
      2095
      389
      158
    
    
      2014-04
      1477
      1115
      2258
      383
      155
    
    
      2014-05
      1437
      1064
      2473
      348
      155
    
    
      2014-06
      1174
      840
      2086
      307
      94
    
    
      2014-07
      1495
      1197
      2640
      490
      197
    
    
      2014-08
      1267
      1009
      2020
      404
      158
    
    
      2014-09
      1438
      1250
      2571
      377
      107
    
    
      2014-10
      1413
      1187
      2499
      370
      117
    
    
      2014-11
      720
      577
      1564
      405
      111
    
    
      2014-12
      686
      617
      1666
      430
      156
    
    
      2015-01
      1352
      1157
      3027
      379
      112
    
    
      2015-02
      1277
      912
      2130
      437
      144



In [12]:

    
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', 1) # Excluding numbers from matenal health facilities as they are tests delivered, not tests made
curitibaPublic









    Out[12]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
    
  
  
    
      2014-01
      1465
      1057
      528
      186
    
    
      2014-02
      1425
      1056
      358
      133
    
    
      2014-03
      1378
      998
      389
      158
    
    
      2014-04
      1477
      1115
      383
      155
    
    
      2014-05
      1437
      1064
      348
      155
    
    
      2014-06
      1174
      840
      307
      94
    
    
      2014-07
      1495
      1197
      490
      197
    
    
      2014-08
      1267
      1009
      404
      158
    
    
      2014-09
      1438
      1250
      377
      107
    
    
      2014-10
      1413
      1187
      370
      117
    
    
      2014-11
      720
      577
      405
      111
    
    
      2014-12
      686
      617
      430
      156
    
    
      2015-01
      1352
      1157
      379
      112
    
    
      2015-02
      1277
      912
      437
      144



In [13]:

    
curitibaPublic.dtypes # Figuring out what datatype each column is read as









    Out[13]:





Primary Care, Females    int64
Primary Care, Males      int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object



In [14]:

    
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such



In [15]:

    
curitibaPublic # Checking that data looks the same after the datatype shenanigans









    Out[15]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
    
  
  
    
      2014-01-01
      1465
      1057
      528
      186
    
    
      2014-02-01
      1425
      1056
      358
      133
    
    
      2014-03-01
      1378
      998
      389
      158
    
    
      2014-04-01
      1477
      1115
      383
      155
    
    
      2014-05-01
      1437
      1064
      348
      155
    
    
      2014-06-01
      1174
      840
      307
      94
    
    
      2014-07-01
      1495
      1197
      490
      197
    
    
      2014-08-01
      1267
      1009
      404
      158
    
    
      2014-09-01
      1438
      1250
      377
      107
    
    
      2014-10-01
      1413
      1187
      370
      117
    
    
      2014-11-01
      720
      577
      405
      111
    
    
      2014-12-01
      686
      617
      430
      156
    
    
      2015-01-01
      1352
      1157
      379
      112
    
    
      2015-02-01
      1277
      912
      437
      144



In [16]:

    
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic









    Out[16]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
    
  
  
    
      2014-01-01
      1465
      1057
      528
      186
      3236
    
    
      2014-02-01
      1425
      1056
      358
      133
      2972
    
    
      2014-03-01
      1378
      998
      389
      158
      2923
    
    
      2014-04-01
      1477
      1115
      383
      155
      3130
    
    
      2014-05-01
      1437
      1064
      348
      155
      3004
    
    
      2014-06-01
      1174
      840
      307
      94
      2415
    
    
      2014-07-01
      1495
      1197
      490
      197
      3379
    
    
      2014-08-01
      1267
      1009
      404
      158
      2838
    
    
      2014-09-01
      1438
      1250
      377
      107
      3172
    
    
      2014-10-01
      1413
      1187
      370
      117
      3087
    
    
      2014-11-01
      720
      577
      405
      111
      1813
    
    
      2014-12-01
      686
      617
      430
      156
      1889
    
    
      2015-01-01
      1352
      1157
      379
      112
      3000
    
    
      2015-02-01
      1277
      912
      437
      144
      2770

Looking at the Ground-Truth Data

Now we'll start having a graphical look at the data. First off we'll create a timeseries graph for each topic and the monthly totals (that is, a timeseries graph for each column)



In [17]:

    
for col in curitibaPublic:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name,                   # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(curitibaPublic[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        curitibaPublic.index,                               # Variable values for the x-axis (index = dates)
        curitibaPublic[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
   # legend().label_text_font='Open Sans'
   # legend().label_text_color='#363636'
   # legend().border_line_color='#f6f6f6'
   # axis().axis_label_text_font = "Open Sans"
   # axis().axis_label_text_font_size = "12pt"
   # axis().axis_label_text_color = "#363636"
   # axis().major_label_text_font="Open Sans"
   # axis().major_label_text_font_size="10pt"
   # axis().minor_tick_line_color = "#d4d4d4"
   # xaxis().axis_line_color = '#d4d4d4'
   # xaxis().major_tick_line_color = "#d4d4d4"
   # yaxis().major_tick_line_color = None
   # yaxis().axis_line_color = None
   # xgrid().grid_line_color = None
   # ygrid().grid_line_color = "#d4d4d4"
    show(fig)

Below we'll insert lines for all topics in one chart to better compare.



In [46]:

    
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Groups', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,1800),
    )
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)

Correlation Between Test Groups

We can see above that there is some co-variance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig a bit further into any potential correlations to see if changes within the sub-groupings are indeed similar. If that is the case, it would imply that there is a large degree of robustness in the data. Or, in other words, if one group gets tested more in one month, so will others, meaning that external factors, such as campaigns or increased risk behavoiur across sub-groupings, could be at play.

First we'll make a simple correlation matrix using Pandas' built-in DataFrame correlations function, .corr. It correlates all columns pairwise using either pearson, kendall, or spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.

Normal Distribution Test

Pearson assumes that the data is normal distributed. We can't really test that with only four data points per series, but the code below has been readied for furture use.



In [19]:

    
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))

#curitibaPublic["Primary Care, Females"].normaltest()









    



Normal Distribution Test for "Primary Care, Females": (10.569402877642911, 0.0050685452331196016)
Normal Distribution Test for "Primary Care, Males": (3.844355404418017, 0.14628804320632191)
Normal Distribution Test for "HIV Testing, Females": (0.41082200345958769, 0.81431256375187089)
Normal Distribution Test for "HIV Testing, Males": (3.2429228913257564, 0.19760969212357282)
Normal Distribution Test for "Curitiba Total": (6.0730025398130856, 0.048002543979462284)






    



/home/ubuntu/anaconda/lib/python2.7/site-packages/scipy/stats/mstats_basic.py:1613: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=14
  np.min(n))

Histograms

As is the case with narmaltests above, we don't really have enough data points for a histogram to be useful as a visual indicator of being normal distributed, but again, for future use.

(gg.ggplot(gg.aes(x="Primary Care, Females"), data=curitibaPublic) + gg.geom_histogram())

(gg.ggplot(gg.aes(x="Primary Care, Males"), data=curitibaPublic) + gg.geom_histogram())

(gg.ggplot(gg.aes(x="HIV Testing, Females"), data=curitibaPublic) + gg.geom_histogram())

(gg.ggplot(gg.aes(x="HIV Testing, Males"), data=curitibaPublic) + gg.geom_histogram())

(gg.ggplot(gg.aes(x="Curitiba Total"), data=curitibaPublic) + gg.geom_histogram())

Correlation Matrix



In [20]:

    
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr









    Out[20]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
    
  
  
    
      Primary Care, Females
      1.000000
      0.930127
      0.021855
      0.266027
      0.975676
    
    
      Primary Care, Males
      0.930127
      1.000000
      0.005270
      0.152996
      0.957776
    
    
      HIV Testing, Males
      0.021855
      0.005270
      1.000000
      0.749629
      0.182782
    
    
      HIV Testing, Females
      0.266027
      0.152996
      0.749629
      1.000000
      0.367992
    
    
      Curitiba Total
      0.975676
      0.957776
      0.182782
      0.367992
      1.000000

curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using kendall curitibaPublicCorrKendall



In [21]:

    
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()

Main Group Correlations



In [22]:

    
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");

Getting the Twitter Data



In [23]:

    
# Checking that the data file looks right #
!cat spark/output-final/all.csv | head









    



city,lat,lon,origdate,topic









cat: write error: Broken pipe



In [24]:

    
# Read in Twitter data file #
twitterData=pd.read_csv('spark/output-final/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN',''],
                          parse_dates=[3],
                          index_col=[3]
                        )



In [25]:

    
twitterData.head()









    Out[25]:






  
    
      
      city
      lat
      lon
      topic
    
    
      origdate
      
      
      
      
    
  
  
    
      2014-04-03 21:23:26
      Recife
      -8.057838
      -34.882897
      Discrimination_Negative
    
    
      2015-02-07 23:51:00
      São Paulo
      -23.500000
      -46.600000
      Discrimination_Negative
    
    
      2014-11-12 20:35:08
      Curitiba
      -25.428954
      -49.267137
      Discrimination_Negative
    
    
      2014-04-06 12:06:17
      Curitiba
      -25.428954
      -49.267137
      Discrimination_Negative
    
    
      2014-09-13 15:38:28
      São Paulo
      -23.500000
      -46.600000
      Discrimination_Negative



In [26]:

    
twitterDataCounts = pd.DataFrame({"Tweets" : twitterData.groupby(["city"]).size()}).reset_index()
twitterDataCounts.sort("Tweets", ascending=False)









    Out[26]:






  
    
      
      city
      Tweets
    
  
  
    
      24
      São Paulo
      20908
    
    
      21
      Rio de Janeiro
      13271
    
    
      17
      Porto Alegre
      10766
    
    
      7
      Curitiba
      7546
    
    
      1
      Belo Horizonte
      6933
    
    
      4
      Brasília
      6492
    
    
      2
      Belém
      4257
    
    
      26
      Vitória
      3474
    
    
      8
      Florianópolis
      3140
    
    
      19
      Recife
      3090
    
    
      9
      Fortaleza
      2622
    
    
      14
      Manaus
      2047
    
    
      22
      Salvador
      2036
    
    
      15
      Natal
      1759
    
    
      23
      São Luís
      1677
    
    
      5
      Campo Grande
      1594
    
    
      3
      Boa Vista
      1232
    
    
      13
      Maceió
      1067
    
    
      0
      Aracaju
      904
    
    
      10
      Goiânia
      849
    
    
      12
      Macapá
      846
    
    
      6
      Cuiabá
      808
    
    
      11
      João Pessoa
      797
    
    
      25
      Teresina
      753
    
    
      20
      Rio Branco
      629
    
    
      18
      Porto Velho
      450
    
    
      16
      Palmas
      186



In [27]:

    
twitterDataCounts.sort("Tweets").plot(x = "city", y = "Tweets", kind = "barh", title = "Number of Tweets per City",
                                legend = False, color="#00aeef", alpha=0.8)









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f653da08910>



In [28]:

    
cityData = pd.merge(twitterDataCounts, populationData, how='outer', left_on="city", right_on="City", copy=True)
cityData = cityData[["City","Population","Tweets"]]
cityData









    Out[28]:






  
    
      
      City
      Population
      Tweets
    
  
  
    
      0
      Aracaju
      571149
      904
    
    
      1
      Belo Horizonte
      2375151
      6933
    
    
      2
      Belém
      1381475
      4257
    
    
      3
      Boa Vista
      277799
      1232
    
    
      4
      Brasília
      2481272
      6492
    
    
      5
      Campo Grande
      776242
      1594
    
    
      6
      Cuiabá
      540814
      808
    
    
      7
      Curitiba
      1751907
      7546
    
    
      8
      Florianópolis
      405189
      3140
    
    
      9
      Fortaleza
      2452185
      2622
    
    
      10
      Goiânia
      1297154
      849
    
    
      11
      João Pessoa
      720954
      797
    
    
      12
      Macapá
      381091
      846
    
    
      13
      Maceió
      932078
      1067
    
    
      14
      Manaus
      1792881
      2047
    
    
      15
      Natal
      803739
      1759
    
    
      16
      Palmas
      221742
      186
    
    
      17
      Porto Alegre
      1409351
      10766
    
    
      18
      Porto Velho
      392475
      450
    
    
      19
      Recife
      1537704
      3090
    
    
      20
      Rio Branco
      308545
      629
    
    
      21
      Rio de Janeiro
      6320446
      13271
    
    
      22
      Salvador
      2674923
      2036
    
    
      23
      São Luís
      958545
      1677
    
    
      24
      São Paulo
      11152968
      20908
    
    
      25
      Teresina
      767559
      753
    
    
      26
      Vitória
      327801
      3474



In [29]:

    
cityData["Tweets per 1,000 inhabitants"] = cityData["Tweets"]/cityData["Population"]*1000
cityData.sort("Tweets per 1,000 inhabitants").plot(x = "City", y = "Tweets per 1,000 inhabitants", kind = "barh", 
                                                   title = "Tweets per 1,000 inhabitants", legend = False,
                                                  color="#00aeef", alpha=0.8)









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f653d40e510>

Case: Curitiba



In [30]:

    
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()









    Out[30]:






  
    
      
      city
      topic
    
    
      origdate
      
      
    
  
  
    
      2014-11-12 20:35:08
      Curitiba
      Discrimination_Negative
    
    
      2014-04-06 12:06:17
      Curitiba
      Discrimination_Negative
    
    
      2014-09-19 02:54:41
      Curitiba
      Discrimination_Negative
    
    
      2014-03-25 19:49:02
      Curitiba
      Discrimination_Negative
    
    
      2014-04-22 06:39:53
      Curitiba
      Discrimination_Negative



In [31]:

    
twitterDataSmall.describe()









    Out[31]:






  
    
      
      city
      topic
    
  
  
    
      count
      7546
      7546
    
    
      unique
      1
      7
    
    
      top
      Curitiba
      Discrimination_Negative
    
    
      freq
      7546
      5697



In [32]:

    
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()









    Out[32]:






  
    
      
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
    
    
      origdate
      
      
      
      
      
      
      
    
  
  
    
      2014-11-12 20:35:08
      0
      1
      0
      0
      0
      0
      0
    
    
      2014-04-06 12:06:17
      0
      1
      0
      0
      0
      0
      0
    
    
      2014-09-19 02:54:41
      0
      1
      0
      0
      0
      0
      0
    
    
      2014-03-25 19:49:02
      0
      1
      0
      0
      0
      0
      0
    
    
      2014-04-22 06:39:53
      0
      1
      0
      0
      0
      0
      0



In [33]:

    
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg









    Out[33]:






  
    
      
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
      Twitter Total
    
    
      origdate
      
      
      
      
      
      
      
      
    
  
  
    
      2014-01-01
      6
      540
      12
      1
      6
      61
      1
      627
    
    
      2014-02-01
      0
      498
      38
      2
      11
      63
      0
      612
    
    
      2014-03-01
      2
      509
      39
      2
      21
      78
      0
      651
    
    
      2014-04-01
      0
      472
      53
      1
      11
      56
      1
      594
    
    
      2014-05-01
      0
      371
      38
      1
      5
      46
      1
      462
    
    
      2014-06-01
      0
      415
      18
      1
      9
      61
      1
      505
    
    
      2014-07-01
      5
      401
      12
      2
      12
      87
      2
      521
    
    
      2014-08-01
      1
      349
      59
      1
      5
      53
      2
      470
    
    
      2014-09-01
      0
      349
      107
      0
      7
      51
      0
      514
    
    
      2014-10-01
      0
      344
      62
      3
      8
      40
      0
      457
    
    
      2014-11-01
      1
      265
      30
      2
      12
      75
      1
      386
    
    
      2014-12-01
      8
      302
      36
      1
      28
      51
      4
      430
    
    
      2015-01-01
      1
      324
      27
      1
      31
      77
      3
      464
    
    
      2015-02-01
      3
      274
      37
      1
      26
      89
      5
      435
    
    
      2015-03-01
      9
      284
      52
      1
      5
      60
      7
      418

Looking at the Twitter Data



In [34]:

    
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)



In [35]:

    
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr









    Out[35]:






  
    
      
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
      Twitter Total
    
  
  
    
      Campaign_Portuguese
      1.000000
      -0.173546
      -0.283673
      -0.126847
      0.155188
      0.157308
      0.720007
      -0.171034
    
    
      Discrimination_Negative
      -0.173546
      1.000000
      -0.236608
      0.124414
      -0.237682
      -0.033660
      -0.591547
      0.966719
    
    
      Discrimination_Positive
      -0.283673
      -0.236608
      1.000000
      -0.292206
      -0.264764
      -0.506955
      -0.130878
      -0.104225
    
    
      Prevention_Negative
      -0.126847
      0.124414
      -0.292206
      1.000000
      -0.007390
      0.106317
      -0.302603
      0.065439
    
    
      Prevention_Neutral
      0.155188
      -0.237682
      -0.264764
      -0.007390
      1.000000
      0.516719
      0.307874
      -0.120481
    
    
      Prevention_Positive
      0.157308
      -0.033660
      -0.506955
      0.106317
      0.516719
      1.000000
      0.254708
      0.065843
    
    
      Testing_Neutral
      0.720007
      -0.591547
      -0.130878
      -0.302603
      0.307874
      0.254708
      1.000000
      -0.552325
    
    
      Twitter Total
      -0.171034
      0.966719
      -0.104225
      0.065439
      -0.120481
      0.065843
      -0.552325
      1.000000

f, ax = plt.subplots(figsize=(8, 8)) sns.corrplot(twitterDataSmallAgg, annot=False, sig_stars=True, diag_names=False, cmap=cmap, ax=ax) f.tight_layout()

Merging Data



In [36]:

    
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True,
      suffixes=('_x', '_y'), copy=True)



In [37]:

    
df









    Out[37]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
      Twitter Total
    
  
  
    
      2014-01-01
      1465
      1057
      528
      186
      3236
      6
      540
      12
      1
      6
      61
      1
      627
    
    
      2014-02-01
      1425
      1056
      358
      133
      2972
      0
      498
      38
      2
      11
      63
      0
      612
    
    
      2014-03-01
      1378
      998
      389
      158
      2923
      2
      509
      39
      2
      21
      78
      0
      651
    
    
      2014-04-01
      1477
      1115
      383
      155
      3130
      0
      472
      53
      1
      11
      56
      1
      594
    
    
      2014-05-01
      1437
      1064
      348
      155
      3004
      0
      371
      38
      1
      5
      46
      1
      462
    
    
      2014-06-01
      1174
      840
      307
      94
      2415
      0
      415
      18
      1
      9
      61
      1
      505
    
    
      2014-07-01
      1495
      1197
      490
      197
      3379
      5
      401
      12
      2
      12
      87
      2
      521
    
    
      2014-08-01
      1267
      1009
      404
      158
      2838
      1
      349
      59
      1
      5
      53
      2
      470
    
    
      2014-09-01
      1438
      1250
      377
      107
      3172
      0
      349
      107
      0
      7
      51
      0
      514
    
    
      2014-10-01
      1413
      1187
      370
      117
      3087
      0
      344
      62
      3
      8
      40
      0
      457
    
    
      2014-11-01
      720
      577
      405
      111
      1813
      1
      265
      30
      2
      12
      75
      1
      386
    
    
      2014-12-01
      686
      617
      430
      156
      1889
      8
      302
      36
      1
      28
      51
      4
      430
    
    
      2015-01-01
      1352
      1157
      379
      112
      3000
      1
      324
      27
      1
      31
      77
      3
      464
    
    
      2015-02-01
      1277
      912
      437
      144
      2770
      3
      274
      37
      1
      26
      89
      5
      435

Comparisons



In [38]:

    
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1)
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr









    Out[38]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      HIV Testing, Males
      HIV Testing, Females
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
    
  
  
    
      Primary Care, Females
      1.000000
      0.930127
      0.021855
      0.266027
      -0.340195
      0.572681
      0.157711
      -0.009420
      -0.356660
      -0.020886
      -0.380191
    
    
      Primary Care, Males
      0.930127
      1.000000
      0.005270
      0.152996
      -0.335326
      0.372802
      0.346881
      -0.030731
      -0.294514
      -0.121403
      -0.360078
    
    
      HIV Testing, Males
      0.021855
      0.005270
      1.000000
      0.749629
      0.787499
      0.115768
      -0.341484
      0.005826
      0.095528
      0.370342
      0.341341
    
    
      HIV Testing, Females
      0.266027
      0.152996
      0.749629
      1.000000
      0.629990
      0.398823
      -0.361133
      0.060185
      -0.061313
      0.194174
      0.188662
    
    
      Campaign_Portuguese
      -0.340195
      -0.335326
      0.787499
      0.629990
      1.000000
      -0.000118
      -0.443256
      -0.064861
      0.387285
      0.237851
      0.538479
    
    
      Discrimination_Negative
      0.572681
      0.372802
      0.115768
      0.398823
      -0.000118
      1.000000
      -0.211037
      0.091550
      -0.337992
      -0.053778
      -0.559711
    
    
      Discrimination_Positive
      0.157711
      0.346881
      -0.341484
      -0.361133
      -0.443256
      -0.211037
      1.000000
      -0.280880
      -0.243220
      -0.504345
      -0.299572
    
    
      Prevention_Negative
      -0.009420
      -0.030731
      0.005826
      0.060185
      -0.064861
      0.091550
      -0.280880
      1.000000
      -0.041254
      0.099711
      -0.298511
    
    
      Prevention_Neutral
      -0.356660
      -0.294514
      0.095528
      -0.061313
      0.387285
      -0.337992
      -0.243220
      -0.041254
      1.000000
      0.519410
      0.685470
    
    
      Prevention_Positive
      -0.020886
      -0.121403
      0.370342
      0.194174
      0.237851
      -0.053778
      -0.504345
      0.099711
      0.519410
      1.000000
      0.407901
    
    
      Testing_Neutral
      -0.380191
      -0.360078
      0.341341
      0.188662
      0.538479
      -0.559711
      -0.299572
      -0.298511
      0.685470
      0.407901
      1.000000



In [39]:

    
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [68]:

    
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr









    Out[68]:






  
    
      
      Curitiba Total
      Twitter Total
    
  
  
    
      Curitiba Total
      1.000000
      0.564951
    
    
      Twitter Total
      0.564951
      1.000000



In [76]:

    
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()
dfTotals["Curitiba Tests Total %"] = dfTotals["Curitiba Total"]/dfTotals["Curitiba Tests Total"]*100
dfTotals["Curitiba Tweets Total %"] = dfTotals["Twitter Total"]/dfTotals["Curitiba Tweets Total"]*100









    



/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [77]:

    
dfTotals









    Out[77]:






  
    
      
      Curitiba Total
      Twitter Total
      Curitiba Tests Total
      Curitiba Tweets Total
      Curitiba Tests Total %
      Curitiba Tweets Total %
    
  
  
    
      2014-01-01
      3236
      627
      39628
      7128
      8.165943
      8.796296
    
    
      2014-02-01
      2972
      612
      39628
      7128
      7.499748
      8.585859
    
    
      2014-03-01
      2923
      651
      39628
      7128
      7.376098
      9.132997
    
    
      2014-04-01
      3130
      594
      39628
      7128
      7.898456
      8.333333
    
    
      2014-05-01
      3004
      462
      39628
      7128
      7.580499
      6.481481
    
    
      2014-06-01
      2415
      505
      39628
      7128
      6.094176
      7.084736
    
    
      2014-07-01
      3379
      521
      39628
      7128
      8.526799
      7.309203
    
    
      2014-08-01
      2838
      470
      39628
      7128
      7.161603
      6.593715
    
    
      2014-09-01
      3172
      514
      39628
      7128
      8.004441
      7.210999
    
    
      2014-10-01
      3087
      457
      39628
      7128
      7.789947
      6.411336
    
    
      2014-11-01
      1813
      386
      39628
      7128
      4.575048
      5.415264
    
    
      2014-12-01
      1889
      430
      39628
      7128
      4.766832
      6.032548
    
    
      2015-01-01
      3000
      464
      39628
      7128
      7.570405
      6.509540
    
    
      2015-02-01
      2770
      435
      39628
      7128
      6.990007
      6.102694



In [79]:

    
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'Tests vs. Tweets', y_axis_label = 'Monthly %', x_axis_label = 'Month',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (dfTotals.index.min(),dfTotals.index.max()), y_range = (0,10),
    )
fig.line(dfTotals.index, dfTotals["Curitiba Tests Total %"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Curitiba Tests"),
fig.line(dfTotals.index, dfTotals["Curitiba Tweets Total %"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "Curitiba Tweets")
show(fig)



In [41]:

    
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [86]:

    
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");



In [43]:

    
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [44]:

    
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [45]:

    
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");

Anomaly Detection

We can only do anomaly detection on tweets, because we don't have enough data points for our ground-truth data. As we have very fine temporal information for all tweets (seconds), we can resample to look at hourly and daily aggregates. That shulkd give us enough data for anomaly detection.

First we resample to daily aggregates.



In [55]:

    
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()









    Out[55]:






  
    
      
      Campaign_Portuguese
      Discrimination_Negative
      Discrimination_Positive
      Prevention_Negative
      Prevention_Neutral
      Prevention_Positive
      Testing_Neutral
      Twitter Total
    
    
      origdate
      
      
      
      
      
      
      
      
    
  
  
    
      2014-01-01
      0
      15
      0
      0
      0
      1
      0
      16
    
    
      2014-01-02
      0
      16
      0
      0
      0
      0
      0
      16
    
    
      2014-01-03
      0
      15
      0
      0
      0
      0
      0
      15
    
    
      2014-01-04
      0
      13
      0
      0
      2
      4
      0
      19
    
    
      2014-01-05
      0
      12
      0
      0
      0
      4
      0
      16

As we'll be using r instead of Python for this, we'll use IPyhon's built-in r interpreter using the so called Magic Functions.



In [56]:

    
%load_ext rmagic









    



/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/extensions/rmagic.py:693: UserWarning: The rmagic extension in IPython is deprecated in favour of rpy2.ipython. If available, that will be loaded instead.
http://rpy.sourceforge.net/
  warnings.warn("The rmagic extension in IPython is deprecated in favour of "

%%R update.packages() install.packages("devtools") devtools::install_github("twitter/AnomalyDetection")



In [57]:

    
%R library(AnomalyDetection)









    Out[57]:





<StrVector - Python:0x7f6b1d9e5bd8 / R:0x3a06f68>
[str, str, str, ..., str, str, str]



In [58]:

    
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)



In [59]:

    
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')



In [60]:

    
!cat TwitterDailyAgg.csv | head









    



origdate,Twitter Total
2014-01-01,16.0
2014-01-02,16.0
2014-01-03,15.0
2014-01-04,19.0
2014-01-05,16.0
2014-01-06,21.0
2014-01-07,28.0
2014-01-08,14.0
2014-01-09,14.0



In [61]:

    
%%R

df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)



In [62]:

    
%R data(df_r)









    Out[62]:





<StrVector - Python:0x7f6b1d9e5878 / R:0x5cf56d8>
[str]



In [63]:

    
%R df_r









    Out[63]:





<DataFrame - Python:0x7f6b1d8fcef0 / R:0x4e64a38>
[StrVector, FloatVector]
  origdate: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7f6b1d9795f0 / R:0x5baeb60>
[str, str, str, ..., str, str, str]
  Twitter.Total: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f6b1d9797e8 / R:0x5baf9d0>
[16.000000, 16.000000, 15.000000, ..., 11.000000, 12.000000, 17.000000]



In [64]:

    
%%R
data(df_r)
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE)
res$plot









    



Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
In addition: Warning messages:
1: In data(df_r) : data set ‘df_r’ not found
2: In data(df_r) : data set ‘df_r’ not found
3: In max(ares) : no non-missing arguments to max; returning -Inf
4: In max(ares) : no non-missing arguments to max; returning -Inf
Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero

DELETE



In [51]:

    
%R raw_data









    Out[51]:





array([ <DataFrame - Python:0x7f36222810e0 / R:0x6e78748>
[Float..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...]
  <no name>: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f3622281488 / R:0x785a320>
[0.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9ea8 / R:0x6611f80>
[       1,        2,        3, ...,       56,       57,       58]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e92d8 / R:0x7aae890>
[      14,       14,       14, ...,       13,       13,       13]
  ...
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e93f8 / R:0x6891040>
[       4,        4,        4, ...,        0,        0,        0]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9488 / R:0x76d8910>
[     268,      268,      268, ...,      278,      278,      278]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9248 / R:0x76e6a40>
[       0,        0,        0, ...,        0,        0,        0],
       <FloatVector - Python:0x7f36222811b8 / R:0x7a673c0>
[182.478000, 176.231000, 183.917000, ..., 153.776000, 150.481000, 146.638000]], dtype=object)



In [52]:

    
%%R

data(raw_data)
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot

Styling



In [1]:

    
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)









    Out[1]:



In [ ]:

	City	State	Population
24	São Paulo	SP	11152968
21	Rio de Janeiro	RJ	6320446
22	Salvador	BA	2674923
4	Brasília	DF	2481272
9	Fortaleza	CE	2452185
2	Belo Horizonte	MG	2375151
14	Manaus	AM	1792881
7	Curitiba	PR	1751907
19	Recife	PE	1537704
17	Porto Alegre	RS	1409351
1	Belém	PA	1381475
10	Goiânia	GO	1297154
23	São Luís	MA	958545
13	Maceió	AL	932078
15	Natal	RN	803739
5	Campo Grande	MS	776242
25	Teresina	PI	767559
11	João Pessoa	PB	720954
0	Aracaju	SE	571149
6	Cuiabá	MT	540814
8	Florianópolis	SC	405189
18	Porto Velho	RO	392475
12	Macapá	AP	381091
26	Vitória	ES	327801
20	Rio Branco	AC	308545
3	Boa Vista	RR	277799
16	Palmas	TO	221742

	2014-01	2014-02	2014-03	2014-04	2014-05	2014-06	2014-07	2014-08	2014-09	2014-10	2014-11	2014-12	2015-01	2015-02
HIV tests performed
Primary Health Care services (females)	1465	1425	1378	1477	1437	1174	1495	1267	1438	1413	720	686	1352	1277
Primary Health Care services (males)	1057	1056	998	1115	1064	840	1197	1009	1250	1187	577	617	1157	912
Pregnant women in public health care	2760	2360	2095	2258	2473	2086	2640	2020	2571	2499	1564	1666	3027	2130
HIV testing center (males)	528	358	389	383	348	307	490	404	377	370	405	430	379	437
HIV testing center (females)	186	133	158	155	155	94	197	158	107	117	111	156	112	144

HIV tests performed	Primary Health Care services (females)	Primary Health Care services (males)	Pregnant women in public health care	HIV testing center (males)	HIV testing center (females)
2014-01	1465	1057	2760	528	186
2014-02	1425	1056	2360	358	133
2014-03	1378	998	2095	389	158
2014-04	1477	1115	2258	383	155
2014-05	1437	1064	2473	348	155
2014-06	1174	840	2086	307	94
2014-07	1495	1197	2640	490	197
2014-08	1267	1009	2020	404	158
2014-09	1438	1250	2571	377	107
2014-10	1413	1187	2499	370	117
2014-11	720	577	1564	405	111
2014-12	686	617	1666	430	156
2015-01	1352	1157	3027	379	112
2015-02	1277	912	2130	437	144

	Primary Care, Females	Primary Care, Males	HIV Testing, Males	HIV Testing, Females
2014-01-01	1465	1057	528	186
2014-02-01	1425	1056	358	133
2014-03-01	1378	998	389	158
2014-04-01	1477	1115	383	155
2014-05-01	1437	1064	348	155
2014-06-01	1174	840	307	94
2014-07-01	1495	1197	490	197
2014-08-01	1267	1009	404	158
2014-09-01	1438	1250	377	107
2014-10-01	1413	1187	370	117
2014-11-01	720	577	405	111
2014-12-01	686	617	430	156
2015-01-01	1352	1157	379	112
2015-02-01	1277	912	437	144

	Primary Care, Females	Primary Care, Males	HIV Testing, Males	HIV Testing, Females	Curitiba Total
Primary Care, Females	1.000000	0.930127	0.021855	0.266027	0.975676
Primary Care, Males	0.930127	1.000000	0.005270	0.152996	0.957776
HIV Testing, Males	0.021855	0.005270	1.000000	0.749629	0.182782
HIV Testing, Females	0.266027	0.152996	0.749629	1.000000	0.367992
Curitiba Total	0.975676	0.957776	0.182782	0.367992	1.000000

	city	lat	lon	topic
origdate
2014-04-03 21:23:26	Recife	-8.057838	-34.882897	Discrimination_Negative
2015-02-07 23:51:00	São Paulo	-23.500000	-46.600000	Discrimination_Negative
2014-11-12 20:35:08	Curitiba	-25.428954	-49.267137	Discrimination_Negative
2014-04-06 12:06:17	Curitiba	-25.428954	-49.267137	Discrimination_Negative
2014-09-13 15:38:28	São Paulo	-23.500000	-46.600000	Discrimination_Negative

	city	Tweets
24	São Paulo	20908
21	Rio de Janeiro	13271
17	Porto Alegre	10766
7	Curitiba	7546
1	Belo Horizonte	6933
4	Brasília	6492
2	Belém	4257
26	Vitória	3474
8	Florianópolis	3140
19	Recife	3090
9	Fortaleza	2622
14	Manaus	2047
22	Salvador	2036
15	Natal	1759
23	São Luís	1677
5	Campo Grande	1594
3	Boa Vista	1232
13	Maceió	1067
0	Aracaju	904
10	Goiânia	849
12	Macapá	846
6	Cuiabá	808
11	João Pessoa	797
25	Teresina	753
20	Rio Branco	629
18	Porto Velho	450
16	Palmas	186

	city	topic
count	7546	7546
unique	1	7
top	Curitiba	Discrimination_Negative
freq	7546	5697

	Campaign_Portuguese	Discrimination_Negative	Discrimination_Positive	Prevention_Negative	Prevention_Neutral	Prevention_Positive	Testing_Neutral	Twitter Total
Campaign_Portuguese	1.000000	-0.173546	-0.283673	-0.126847	0.155188	0.157308	0.720007	-0.171034
Discrimination_Negative	-0.173546	1.000000	-0.236608	0.124414	-0.237682	-0.033660	-0.591547	0.966719
Discrimination_Positive	-0.283673	-0.236608	1.000000	-0.292206	-0.264764	-0.506955	-0.130878	-0.104225
Prevention_Negative	-0.126847	0.124414	-0.292206	1.000000	-0.007390	0.106317	-0.302603	0.065439
Prevention_Neutral	0.155188	-0.237682	-0.264764	-0.007390	1.000000	0.516719	0.307874	-0.120481
Prevention_Positive	0.157308	-0.033660	-0.506955	0.106317	0.516719	1.000000	0.254708	0.065843
Testing_Neutral	0.720007	-0.591547	-0.130878	-0.302603	0.307874	0.254708	1.000000	-0.552325
Twitter Total	-0.171034	0.966719	-0.104225	0.065439	-0.120481	0.065843	-0.552325	1.000000

	Curitiba Total	Twitter Total	Curitiba Tests Total	Curitiba Tweets Total	Curitiba Tests Total %	Curitiba Tweets Total %
2014-01-01	3236	627	39628	7128	8.165943	8.796296
2014-02-01	2972	612	39628	7128	7.499748	8.585859
2014-03-01	2923	651	39628	7128	7.376098	9.132997
2014-04-01	3130	594	39628	7128	7.898456	8.333333
2014-05-01	3004	462	39628	7128	7.580499	6.481481
2014-06-01	2415	505	39628	7128	6.094176	7.084736
2014-07-01	3379	521	39628	7128	8.526799	7.309203
2014-08-01	2838	470	39628	7128	7.161603	6.593715
2014-09-01	3172	514	39628	7128	8.004441	7.210999
2014-10-01	3087	457	39628	7128	7.789947	6.411336
2014-11-01	1813	386	39628	7128	4.575048	5.415264
2014-12-01	1889	430	39628	7128	4.766832	6.032548
2015-01-01	3000	464	39628	7128	7.570405	6.509540
2015-02-01	2770	435	39628	7128	6.990007	6.102694

	Campaign_Portuguese	Discrimination_Negative	Discrimination_Positive	Prevention_Negative	Prevention_Neutral	Prevention_Positive	Testing_Neutral	Twitter Total
origdate
2014-01-01	0	15	0	0	0	1	0	16
2014-01-02	0	16	0	0	0	0	0	16
2014-01-03	0	15	0	0	0	0	0	15
2014-01-04	0	13	0	0	2	4	0	19
2014-01-05	0	12	0	0	0	4	0	16