In [1]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy
import scipy.stats  # needed explicitly for scipy.stats.mstats.normaltest below
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *  # brings figure, show, output_notebook, etc. into the namespace
In [2]:
# Watermark can be installed with `pip install watermark`; the %install_ext approach below is deprecated in newer IPython:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,ggplot,matplotlib,mpld3,seaborn,bokeh -d -n -t -z -v -m -g
In [3]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
sns.set_context("poster")
sns.despine()
#sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12,
# 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'axes.facecolor': ".97",
# 'grid.color': '.9', 'axes.edgecolor': '.9', 'font.family': ['sans-serif'], 'lines.solid_capstyle': 'round',
# 'font.sans-serif': ['Liberation Sans','Bitstream Vera Sans','sans-serif','Arial'],})
sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12,
'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'grid.color': '.9',
'axes.edgecolor': '.9'})
In [4]:
populationData = pd.read_csv('../data/BrazilPopulation2014.csv', encoding='utf-8',)
populationData.sort("Population", ascending=False)
Out[4]:
In [5]:
populationData.sort("Population").plot(x = "City",
y = "Population",
kind = "barh",
title = "Number of Inhabitants per City",
legend = False,
# colormap = cmap,
color = "#00447c",
alpha=1)
Out[5]:
In [6]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")
In [7]:
curitibaData.sheet_names
Out[7]:
In [8]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)
In [9]:
curitibaPublic
Out[9]:
In [10]:
# curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09'] # Including only months where we also have Twitter data
# curitibaPublic
In [11]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic
Out[11]:
In [12]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', 1) # Excluding numbers from maternal health facilities, as they count tests delivered rather than tests performed
curitibaPublic
Out[12]:
In [13]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as
Out[13]:
In [14]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such
In [15]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans
Out[15]:
In [16]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic
Out[16]:
Now we'll start looking at the data graphically. First we'll create a time series graph for each topic and for the monthly totals (that is, one graph per column).
In [17]:
for col in curitibaPublic:
    fig = figure( # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name, # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FFFFFF', # Background colour for plot area (#FAFAFA)
        outline_line_color = '#FFFFFF', # Colour of line surrounding plot (#FAFAFA)
        border_fill = '#FFFFFF', # Background colour for surrounding area (#FAFAFA)
        x_axis_type = 'datetime', # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()), # Setting x-axis to start and end on first and last date of dataset
        y_range = (0, (curitibaPublic[col].max() * 1.1)), # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools = "pan,wheel_zoom,box_zoom,reset,previewsave" # NOTE: only needed on first; if commented out, default tools are chosen
    )
    fig.line( # Inserting a line in the chart called "fig"
        curitibaPublic.index, # Variable values for the x-axis (index = dates)
        curitibaPublic[col], # Variable values for the y-axis (loops over all columns)
        line_color = '#404040', # Colour of the line
        line_width = 10, # Width of the line
        line_alpha = 0.7, # Opacity of the line
        #legend = curitibaPublic[col].name, # Label name for the legend (column name)
    )
    # legend().label_text_font='Open Sans'
    # legend().label_text_color='#363636'
    # legend().border_line_color='#f6f6f6'
    # axis().axis_label_text_font = "Open Sans"
    # axis().axis_label_text_font_size = "12pt"
    # axis().axis_label_text_color = "#363636"
    # axis().major_label_text_font="Open Sans"
    # axis().major_label_text_font_size="10pt"
    # axis().minor_tick_line_color = "#d4d4d4"
    # xaxis().axis_line_color = '#d4d4d4'
    # xaxis().major_tick_line_color = "#d4d4d4"
    # yaxis().major_tick_line_color = None
    # yaxis().axis_line_color = None
    # xgrid().grid_line_color = None
    # ygrid().grid_line_color = "#d4d4d4"
    show(fig)
Below we'll plot lines for all topics in a single chart to make them easier to compare.
In [18]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Groups', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FFFFFF',
    outline_line_color = '#FFFFFF', border_fill = '#FFFFFF', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(), curitibaPublic.index.max()), y_range = (0, 1800),
)
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)
We can see above that there is some covariance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig further into potential correlations to see whether changes within the sub-groupings are indeed similar. If they are, it would imply a large degree of robustness in the data: when one group gets tested more in a given month, so do the others, meaning that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.
First we'll make a simple correlation matrix using pandas' built-in DataFrame correlation function, .corr(), which correlates all columns pairwise using Pearson, Kendall, or Spearman.
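As an aside, switching method is just a keyword change. A quick sketch on toy data (hypothetical numbers, not part of this analysis):
demo = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [2, 1, 4, 3, 6]})
print(demo.corr())                   # Pearson, the default
print(demo.corr(method='spearman'))  # rank-based alternative
print(demo.corr(method='kendall'))   # rank-based alternative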
As we're looking at time series correlations, we'll use the default, Pearson. Since Pearson correlation assumes roughly normally distributed data, we first run a normality test on each series.
In [19]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])
print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))
#curitibaPublic["Primary Care, Females"].normaltest()
In [20]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr
Out[20]:
In [21]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [22]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");
In [23]:
# Checking that the data file looks right #
!cat spark/output-final/all.csv | head
In [24]:
# Read in Twitter data file #
twitterData = pd.read_csv('spark/output-final/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN', ''],
                          parse_dates=[3],
                          index_col=[3]
                          )
In [25]:
twitterData.head()
Out[25]:
In [26]:
twitterDataCounts = pd.DataFrame({"Tweets" : twitterData.groupby(["city"]).size()}).reset_index()
twitterDataCounts.sort("Tweets", ascending=False)
Out[26]:
In [27]:
twitterDataCounts.sort("Tweets").plot(x = "city", y = "Tweets", kind = "barh", title = "Number of Tweets per City",
legend = False, color="#00447c", alpha=0.8)
Out[27]:
In [28]:
cityData = pd.merge(twitterDataCounts, populationData, how='outer', left_on="city", right_on="City", copy=True)
cityData = cityData[["City","Population","Tweets"]]
cityData
Out[28]:
In [29]:
cityData["Tweets per 1,000 inhabitants"] = cityData["Tweets"]/cityData["Population"]*1000/15
cityData.sort("Tweets per 1,000 inhabitants").plot(x = "City", y = "Tweets per 1,000 inhabitants", kind = "barh",
title = "Monthly Tweets per 1,000 inhabitants", legend = False,
color="#00447c", alpha=0.8)
Out[29]:
In [30]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()
Out[30]:
In [31]:
twitterDataSmall.describe()
Out[31]:
In [32]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()
Out[32]:
In [33]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg
Out[33]:
In [34]:
for col in twitterDataSmallAgg:
    fig = figure( # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name, # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FFFFFF', # Background colour for plot area
        outline_line_color = '#FFFFFF', # Colour of line surrounding plot
        border_fill = '#FFFFFF', # Background colour for surrounding area
        x_axis_type = 'datetime', # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()), # Setting x-axis to start and end on first and last date of dataset
        y_range = (0, (twitterDataSmallAgg[col].max() * 1.1)), # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools = "pan,wheel_zoom,box_zoom,reset,previewsave" # NOTE: only needed on first; if commented out, default tools are chosen
    )
    fig.line( # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index, # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col], # Variable values for the y-axis (loops over all columns)
        line_color = '#404040', # Colour of the line
        line_width = 10, # Width of the line
        line_alpha = 0.7, # Opacity of the line
        #legend = twitterDataSmallAgg[col].name, # Label name for the legend (column name)
    )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)
In [35]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr
Out[35]:
In [36]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left', on=None, left_on=None, right_on=None,
              left_index=True, right_index=True, sort=True,
              suffixes=('_x', '_y'), copy=True)
In [37]:
df
Out[37]:
In [38]:
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1)
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr
Out[38]:
In [39]:
f, ax = plt.subplots(figsize=(12, 12))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [40]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr
Out[40]:
In [41]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [42]:
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()
dfTotals["Curitiba Tests Total %"] = dfTotals["Curitiba Total"]/dfTotals["Curitiba Tests Total"]*100
dfTotals["Curitiba Tweets Total %"] = dfTotals["Twitter Total"]/dfTotals["Curitiba Tweets Total"]*100
In [43]:
dfTotals
Out[43]:
In [44]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'Tests vs. Tweets', x_axis_label = 'Month', # y_axis_label = 'Monthly %',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FFFFFF',
    outline_line_color = '#FFFFFF', border_fill = '#FFFFFF', x_axis_type = 'datetime',
    x_range = (dfTotals.index.min(), dfTotals.index.max()), y_range = (0, 10),
)
fig.line(dfTotals.index, dfTotals["Curitiba Tests Total %"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7,
         legend = "HIV Tests in Public Clinics, Curitiba")
fig.line(dfTotals.index, dfTotals["Curitiba Tweets Total %"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV and Discrimination Tweets, Curitiba")
show(fig)
In [45]:
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");
In [46]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");
In [47]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");
In [48]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");
We can only do anomaly detection on the tweets, because we don't have enough data points in our ground-truth data. As we have very fine-grained temporal information for all tweets (down to the second), we can resample to hourly and daily aggregates. That should give us enough data for anomaly detection.
First we resample to daily aggregates.
In [55]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with daily totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()
Out[55]:
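The same pattern gives the hourly aggregates mentioned above; a minimal sketch (not run here, using the same old-style resample API as the rest of this notebook):
twitterDataHourlyAgg = twitterDataSmall.resample('H', how='sum') # Summing each topic over each hour
twitterDataHourlyAgg['Twitter Total'] = twitterDataHourlyAgg.sum(axis=1) # Adding a column with hourly totals
twitterDataHourlyAgg.head()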
As we'll be using R instead of Python for this, we'll use IPython's built-in R integration via the so-called magic functions.
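(In newer IPython/rpy2 versions the extension is loaded with %load_ext rpy2.ipython instead.) For reference, rmagic provides a line magic, %R, for single statements and a cell magic, %%R, for whole cells, and variables can be passed from Python with -i and back with -o. A minimal illustrative sketch (each magic would live in its own cell):
%R x <- c(1, 2, 3) # line magic: run a single R statement

%%R -i df_r -o df_summary
# cell magic: -i pulls df_r in from Python, -o pushes df_summary back out
df_summary <- summary(df_r)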
In [56]:
%load_ext rmagic
In [57]:
%R library(AnomalyDetection)
Out[57]:
In [58]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)
In [59]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')
In [60]:
!cat TwitterDailyAgg.csv | head
In [61]:
%%R
df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)
In [62]:
%R data(df_r) # NOTE: data() targets packaged datasets, so this call is a no-op here; df_r already exists from read.csv above
Out[62]:
In [63]:
%R df_r
Out[63]:
In [64]:
%%R
data(df_r) # NOTE: a no-op here (data() loads packaged datasets); df_r already exists from read.csv
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE) # Flag at most 2% of observations as anomalies, looking in both directions
res$plot
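Besides the plot, the returned list also carries the anomalies themselves in res$anoms; a minimal sketch of inspecting them:
%%R
res$anoms # Data frame of the flagged timestamps and their values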
In [51]:
%R raw_data
Out[51]:
In [52]:
%%R
data(raw_data) # raw_data is the example dataset shipped with the AnomalyDetection package
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot
In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)
Out[1]:
In [ ]: