Setting the Stage


In [2]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy
import scipy.stats # Imported explicitly, as scipy.stats.mstats is used below
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *

In [3]:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,ggplot,matplotlib,mpld3,seaborn,bokeh -d -n -t -z -v -m -g


Rene Clausen Nielsen, UN Global Pulse 28/05/2015 14:37:26 UTC

CPython 2.7.9
IPython 3.1.0

pandas 0.16.1
numpy 1.9.2
scipy 0.15.1
geolocator 0.2.dev0
ggplot 0.6.5
matplotlib 1.4.3
mpld3 0.2
seaborn 0.5.1
bokeh 0.8.2

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : 

In [4]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
sns.set_context("poster")
sns.despine()
sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12, 
                        'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'axes.facecolor': ".97",
                        'grid.color': '.9', 'axes.edgecolor': '.9', 'font.family': ['sans-serif'], 'lines.solid_capstyle': 'round',
                        'font.sans-serif': ['Liberation Sans','Bitstream Vera Sans','sans-serif','Arial'],})


BokehJS successfully loaded.
<matplotlib.figure.Figure at 0x7f3bad68aad0>

Getting Population Data


In [5]:
populationData = pd.read_csv('../data/BrazilPopulation.csv', encoding='utf-8',)
populationData.sort("Population", ascending=False)


Out[5]:
City State Population
24 São Paulo SP 11152968
21 Rio de Janeiro RJ 6320446
22 Salvador BA 2674923
4 Brasília DF 2481272
9 Fortaleza CE 2452185
2 Belo Horizonte MG 2375151
14 Manaus AM 1792881
7 Curitiba PR 1751907
19 Recife PE 1537704
17 Porto Alegre RS 1409351
1 Belém PA 1381475
10 Goiânia GO 1297154
23 São Luís MA 958545
13 Maceió AL 932078
15 Natal RN 803739
5 Campo Grande MS 776242
25 Teresina PI 767559
11 João Pessoa PB 720954
0 Aracaju SE 571149
6 Cuiabá MT 540814
8 Florianópolis SC 405189
18 Porto Velho RO 392475
12 Macapá AP 381091
26 Vitória ES 327801
20 Rio Branco AC 308545
3 Boa Vista RR 277799
16 Palmas TO 221742

In [6]:
populationData.sort("Population").plot(x = "City",
                   y = "Population",
                   kind = "barh", 
                   title = "Number of Inhabitants per City",
                   legend = False,
                   # colormap = cmap,
                   color = "#00aeef",
                   alpha=0.8)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3bad6cccd0>

Getting the Ground-Truth Data


In [7]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba-dasa.xls")

In [8]:
curitibaData.sheet_names


Out[8]:
[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']

In [9]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)

In [10]:
curitibaPublic


Out[10]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10 2014-11 2014-12 2015-01 2015-02 2015-03
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413 720 686 1352 1277 1628
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187 577 617 1157 912 1169
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499 1564 1666 3027 2130 2414
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370 405 430 379 437 572
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117 111 156 112 144 211
Private lab: DASA Total 4612 4560 4620 4527 4799 4156 4801 4695 4408 4726 4377 3612 4405 4256 5206

In [11]:
# curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09'] # Including only months where we also have Twitter data 
# curitibaPublic

Turning the Table


In [12]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic


Out[12]:
HIV tests performed Primary Health Care services (females) Primary Health Care services (males) Pregnant women in public health care HIV testing center (males) HIV testing center (females) Private lab: DASA Total
2014-01 1465 1057 2760 528 186 4612
2014-02 1425 1056 2360 358 133 4560
2014-03 1378 998 2095 389 158 4620
2014-04 1477 1115 2258 383 155 4527
2014-05 1437 1064 2473 348 155 4799
2014-06 1174 840 2086 307 94 4156
2014-07 1495 1197 2640 490 197 4801
2014-08 1267 1009 2020 404 158 4695
2014-09 1438 1250 2571 377 107 4408
2014-10 1413 1187 2499 370 117 4726
2014-11 720 577 1564 405 111 4377
2014-12 686 617 1666 430 156 3612
2015-01 1352 1157 3027 379 112 4405
2015-02 1277 912 2130 437 144 4256
2015-03 1628 1169 2414 572 211 5206

In [13]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females', 'Private lab: DASA Total'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', 1) # Excluding numbers from maternal health facilities as they reflect tests delivered, not tests performed
curitibaPublic


Out[13]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total
2014-01 1465 1057 528 186 4612
2014-02 1425 1056 358 133 4560
2014-03 1378 998 389 158 4620
2014-04 1477 1115 383 155 4527
2014-05 1437 1064 348 155 4799
2014-06 1174 840 307 94 4156
2014-07 1495 1197 490 197 4801
2014-08 1267 1009 404 158 4695
2014-09 1438 1250 377 107 4408
2014-10 1413 1187 370 117 4726
2014-11 720 577 405 111 4377
2014-12 686 617 430 156 3612
2015-01 1352 1157 379 112 4405
2015-02 1277 912 437 144 4256
2015-03 1628 1169 572 211 5206

In [14]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as


Out[14]:
Primary Care, Females      int64
Primary Care, Males        int64
HIV Testing, Males         int64
HIV Testing, Females       int64
Private lab: DASA Total    int64
dtype: object

In [15]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such

In [16]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans


Out[16]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total
2014-01-01 1465 1057 528 186 4612
2014-02-01 1425 1056 358 133 4560
2014-03-01 1378 998 389 158 4620
2014-04-01 1477 1115 383 155 4527
2014-05-01 1437 1064 348 155 4799
2014-06-01 1174 840 307 94 4156
2014-07-01 1495 1197 490 197 4801
2014-08-01 1267 1009 404 158 4695
2014-09-01 1438 1250 377 107 4408
2014-10-01 1413 1187 370 117 4726
2014-11-01 720 577 405 111 4377
2014-12-01 686 617 430 156 3612
2015-01-01 1352 1157 379 112 4405
2015-02-01 1277 912 437 144 4256
2015-03-01 1628 1169 572 211 5206

In [17]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic


Out[17]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total Curitiba Total
2014-01-01 1465 1057 528 186 4612 7848
2014-02-01 1425 1056 358 133 4560 7532
2014-03-01 1378 998 389 158 4620 7543
2014-04-01 1477 1115 383 155 4527 7657
2014-05-01 1437 1064 348 155 4799 7803
2014-06-01 1174 840 307 94 4156 6571
2014-07-01 1495 1197 490 197 4801 8180
2014-08-01 1267 1009 404 158 4695 7533
2014-09-01 1438 1250 377 107 4408 7580
2014-10-01 1413 1187 370 117 4726 7813
2014-11-01 720 577 405 111 4377 6190
2014-12-01 686 617 430 156 3612 5501
2015-01-01 1352 1157 379 112 4405 7405
2015-02-01 1277 912 437 144 4256 7026
2015-03-01 1628 1169 572 211 5206 8786

Looking at the Ground-Truth Data

Now we'll take a graphical look at the data. First we'll create a time-series graph for each test group and for the monthly totals (that is, one graph per column).


In [18]:
for col in curitibaPublic:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name,                   # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line surrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(curitibaPublic[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        curitibaPublic.index,                               # Variable values for the x-axis (index = dates)
        curitibaPublic[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
   # legend().label_text_font='Open Sans'
   # legend().label_text_color='#363636'
   # legend().border_line_color='#f6f6f6'
   # axis().axis_label_text_font = "Open Sans"
   # axis().axis_label_text_font_size = "12pt"
   # axis().axis_label_text_color = "#363636"
   # axis().major_label_text_font="Open Sans"
   # axis().major_label_text_font_size="10pt"
   # axis().minor_tick_line_color = "#d4d4d4"
   # xaxis().axis_line_color = '#d4d4d4'
   # xaxis().major_tick_line_color = "#d4d4d4"
   # yaxis().major_tick_line_color = None
   # yaxis().axis_line_color = None
   # xgrid().grid_line_color = None
   # ygrid().grid_line_color = "#d4d4d4"
    show(fig)


Below we'll plot lines for all groups in a single chart to make them easier to compare.


In [19]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Groups', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,5300),
    )
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
fig.line(curitibaPublic.index, curitibaPublic["Private lab: DASA Total"], line_color = '#5d6263', line_width = 5, line_alpha = 0.7,
         legend = "DASA")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)


Correlation Between Test Groups

We can see above that there is some co-variance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig further into potential correlations to see whether changes within the sub-groupings are indeed similar. If they are, it would imply a large degree of robustness in the data: if one group gets tested more in a given month, so do the others, suggesting that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.

First we'll make a simple correlation matrix using pandas' built-in DataFrame correlation method, .corr(). It correlates all columns pairwise using Pearson, Kendall, or Spearman.

As we're looking at time-series correlations, we'll just use the default: Pearson.
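For reference, all three methods can be compared side by side. The loop below is a minimal sketch (not one of the executed cells); the rank-based methods don't assume normality, so agreement between them is a useful robustness check.

# Sketch: compare Pearson, Kendall, and Spearman on the same DataFrame
for method in ['pearson', 'kendall', 'spearman']:
    print(method)
    print(curitibaPublic.corr(method=method))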

Normal Distribution Test

Pearson assumes that the data is normally distributed. We can't really test that reliably with only 15 data points per series (SciPy's kurtosis test itself wants n >= 20, as the warning below shows), but the code below has been readied for future use.
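For samples this small, the Shapiro-Wilk test is often preferred; the following is a sketch (not run here) using scipy.stats.shapiro:

# Sketch (not executed): Shapiro-Wilk avoids the kurtosis test that
# SciPy warns about below for n < 20
for col in curitibaPublic:
    W, p = scipy.stats.shapiro(curitibaPublic[col])
    print('Shapiro-Wilk for "%s": W=%.3f, p=%.3f' % (col, W, p))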


In [20]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))

#curitibaPublic["Primary Care, Females"].normaltest()


Normal Distribution Test for "Primary Care, Females": (10.183630689925293, 0.0061468510974571266)
Normal Distribution Test for "Primary Care, Males": (4.8762898611677077, 0.087322690957151297)
Normal Distribution Test for "HIV Testing, Females": (0.54129509083924487, 0.76288533145558435)
Normal Distribution Test for "HIV Testing, Males": (3.9888908656263116, 0.13608910379542336)
Normal Distribution Test for "Curitiba Total": (4.0607484299447947, 0.13128638262483081)
/home/ubuntu/anaconda/lib/python2.7/site-packages/scipy/stats/mstats_basic.py:1613: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=15
  np.min(n))

Histograms

As is the case with the normality tests above, we don't really have enough data points for a histogram to be useful as a visual indicator of normality, but again, this is here for future use.

(gg.ggplot(gg.aes(x="Primary Care, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Primary Care, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Curitiba Total"), data=curitibaPublic) + gg.geom_histogram())

Correlation Matrix


In [21]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr


Out[21]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total Curitiba Total
Primary Care, Females 1.000000 0.928283 0.222689 0.385662 0.770246 0.942318
Primary Care, Males 0.928283 1.000000 0.137849 0.238497 0.674439 0.880188
HIV Testing, Males 0.222689 0.137849 1.000000 0.824896 0.378382 0.399023
HIV Testing, Females 0.385662 0.238497 0.824896 1.000000 0.496491 0.522953
Private lab: DASA Total 0.770246 0.674439 0.378382 0.496491 1.000000 0.922685
Curitiba Total 0.942318 0.880188 0.399023 0.522953 0.922685 1.000000
curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using Kendall
curitibaPublicCorrKendall

In [22]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Main Group Correlations


In [23]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");


Getting the Twitter Data


In [24]:
# Checking that the data file looks right #
!cat spark/output-final/all.csv | head


city,lat,lon,origdate,topic









cat: write error: Broken pipe

In [25]:
# Read in Twitter data file #
twitterData=pd.read_csv('spark/output-final/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN',''],
                          parse_dates=[3],
                          index_col=[3]
                        )

In [26]:
twitterData.head()


Out[26]:
city lat lon topic
origdate
2014-04-03 21:23:26 Recife -8.057838 -34.882897 Discrimination_Negative
2015-02-07 23:51:00 São Paulo -23.500000 -46.600000 Discrimination_Negative
2014-11-12 20:35:08 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-04-06 12:06:17 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-09-13 15:38:28 São Paulo -23.500000 -46.600000 Discrimination_Negative

In [27]:
twitterDataCounts = pd.DataFrame({"Tweets" : twitterData.groupby(["city"]).size()}).reset_index()
twitterDataCounts.sort("Tweets", ascending=False)


Out[27]:
city Tweets
24 São Paulo 20908
21 Rio de Janeiro 13271
17 Porto Alegre 10766
7 Curitiba 7546
1 Belo Horizonte 6933
4 Brasília 6492
2 Belém 4257
26 Vitória 3474
8 Florianópolis 3140
19 Recife 3090
9 Fortaleza 2622
14 Manaus 2047
22 Salvador 2036
15 Natal 1759
23 São Luís 1677
5 Campo Grande 1594
3 Boa Vista 1232
13 Maceió 1067
0 Aracaju 904
10 Goiânia 849
12 Macapá 846
6 Cuiabá 808
11 João Pessoa 797
25 Teresina 753
20 Rio Branco 629
18 Porto Velho 450
16 Palmas 186

In [28]:
twitterDataCounts.sort("Tweets").plot(x = "city", y = "Tweets", kind = "barh", title = "Number of Tweets per City",
                                legend = False, color="#00aeef", alpha=0.8)


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3bad2a8590>

In [29]:
cityData = pd.merge(twitterDataCounts, populationData, how='outer', left_on="city", right_on="City", copy=True)
cityData = cityData[["City","Population","Tweets"]]
cityData


Out[29]:
City Population Tweets
0 Aracaju 571149 904
1 Belo Horizonte 2375151 6933
2 Belém 1381475 4257
3 Boa Vista 277799 1232
4 Brasília 2481272 6492
5 Campo Grande 776242 1594
6 Cuiabá 540814 808
7 Curitiba 1751907 7546
8 Florianópolis 405189 3140
9 Fortaleza 2452185 2622
10 Goiânia 1297154 849
11 João Pessoa 720954 797
12 Macapá 381091 846
13 Maceió 932078 1067
14 Manaus 1792881 2047
15 Natal 803739 1759
16 Palmas 221742 186
17 Porto Alegre 1409351 10766
18 Porto Velho 392475 450
19 Recife 1537704 3090
20 Rio Branco 308545 629
21 Rio de Janeiro 6320446 13271
22 Salvador 2674923 2036
23 São Luís 958545 1677
24 São Paulo 11152968 20908
25 Teresina 767559 753
26 Vitória 327801 3474

In [47]:
cityData["Tweets per 1,000 inhabitants"] = cityData["Tweets"]/cityData["Population"]*1000
cityData.sort("Tweets per 1,000 inhabitants").plot(x = "City", y = "Tweets per 1,000 inhabitants", kind = "barh", 
                                                   title = "Tweets per 1,000 inhabitants", legend = False,
                                                  color="#00aeef", alpha=0.7)


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3bacc348d0>

Case: Curitiba


In [31]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()


Out[31]:
city topic
origdate
2014-11-12 20:35:08 Curitiba Discrimination_Negative
2014-04-06 12:06:17 Curitiba Discrimination_Negative
2014-09-19 02:54:41 Curitiba Discrimination_Negative
2014-03-25 19:49:02 Curitiba Discrimination_Negative
2014-04-22 06:39:53 Curitiba Discrimination_Negative

In [32]:
twitterDataSmall.describe()


Out[32]:
city topic
count 7546 7546
unique 1 7
top Curitiba Discrimination_Negative
freq 7546 5697

In [33]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()


Out[33]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
origdate
2014-11-12 20:35:08 0 1 0 0 0 0 0
2014-04-06 12:06:17 0 1 0 0 0 0 0
2014-09-19 02:54:41 0 1 0 0 0 0 0
2014-03-25 19:49:02 0 1 0 0 0 0 0
2014-04-22 06:39:53 0 1 0 0 0 0 0

In [34]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg


Out[34]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 6 540 12 1 6 61 1 627
2014-02-01 0 498 38 2 11 63 0 612
2014-03-01 2 509 39 2 21 78 0 651
2014-04-01 0 472 53 1 11 56 1 594
2014-05-01 0 371 38 1 5 46 1 462
2014-06-01 0 415 18 1 9 61 1 505
2014-07-01 5 401 12 2 12 87 2 521
2014-08-01 1 349 59 1 5 53 2 470
2014-09-01 0 349 107 0 7 51 0 514
2014-10-01 0 344 62 3 8 40 0 457
2014-11-01 1 265 30 2 12 75 1 386
2014-12-01 8 302 36 1 28 51 4 430
2015-01-01 1 324 27 1 31 77 3 464
2015-02-01 3 274 37 1 26 89 5 435
2015-03-01 9 284 52 1 5 60 7 418

Looking at the Twitter Data


In [35]:
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line surrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)



In [36]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr


Out[36]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
Campaign_Portuguese 1.000000 -0.173546 -0.283673 -0.126847 0.155188 0.157308 0.720007 -0.171034
Discrimination_Negative -0.173546 1.000000 -0.236608 0.124414 -0.237682 -0.033660 -0.591547 0.966719
Discrimination_Positive -0.283673 -0.236608 1.000000 -0.292206 -0.264764 -0.506955 -0.130878 -0.104225
Prevention_Negative -0.126847 0.124414 -0.292206 1.000000 -0.007390 0.106317 -0.302603 0.065439
Prevention_Neutral 0.155188 -0.237682 -0.264764 -0.007390 1.000000 0.516719 0.307874 -0.120481
Prevention_Positive 0.157308 -0.033660 -0.506955 0.106317 0.516719 1.000000 0.254708 0.065843
Testing_Neutral 0.720007 -0.591547 -0.130878 -0.302603 0.307874 0.254708 1.000000 -0.552325
Twitter Total -0.171034 0.966719 -0.104225 0.065439 -0.120481 0.065843 -0.552325 1.000000
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(twitterDataSmallAgg, annot=False, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()

Merging Data


In [37]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left',
              left_index=True, right_index=True, sort=True)

In [38]:
df


Out[38]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total Curitiba Total Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
2014-01-01 1465 1057 528 186 4612 7848 6 540 12 1 6 61 1 627
2014-02-01 1425 1056 358 133 4560 7532 0 498 38 2 11 63 0 612
2014-03-01 1378 998 389 158 4620 7543 2 509 39 2 21 78 0 651
2014-04-01 1477 1115 383 155 4527 7657 0 472 53 1 11 56 1 594
2014-05-01 1437 1064 348 155 4799 7803 0 371 38 1 5 46 1 462
2014-06-01 1174 840 307 94 4156 6571 0 415 18 1 9 61 1 505
2014-07-01 1495 1197 490 197 4801 8180 5 401 12 2 12 87 2 521
2014-08-01 1267 1009 404 158 4695 7533 1 349 59 1 5 53 2 470
2014-09-01 1438 1250 377 107 4408 7580 0 349 107 0 7 51 0 514
2014-10-01 1413 1187 370 117 4726 7813 0 344 62 3 8 40 0 457
2014-11-01 720 577 405 111 4377 6190 1 265 30 2 12 75 1 386
2014-12-01 686 617 430 156 3612 5501 8 302 36 1 28 51 4 430
2015-01-01 1352 1157 379 112 4405 7405 1 324 27 1 31 77 3 464
2015-02-01 1277 912 437 144 4256 7026 3 274 37 1 26 89 5 435
2015-03-01 1628 1169 572 211 5206 8786 9 284 52 1 5 60 7 418

Comparisons


In [39]:
dfNoTotals = df.drop(df.columns[[5, 13]], axis=1)
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr


Out[39]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Private lab: DASA Total Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
Primary Care, Females 1.000000 0.928283 0.222689 0.385662 0.770246 -0.067183 0.419378 0.188460 -0.050735 -0.409009 -0.039294 -0.034574
Primary Care, Males 0.928283 1.000000 0.137849 0.238497 0.674439 -0.140317 0.284962 0.362658 -0.056928 -0.332257 -0.131103 -0.108993
HIV Testing, Males 0.222689 0.137849 1.000000 0.824896 0.378382 0.864737 -0.099670 -0.186004 -0.075539 -0.086833 0.250205 0.624729
HIV Testing, Females 0.385662 0.238497 0.824896 1.000000 0.496491 0.740888 0.171614 -0.241764 -0.015288 -0.182117 0.134543 0.474609
Private lab: DASA Total 0.770246 0.674439 0.378382 0.496491 1.000000 0.018468 0.156060 0.107902 0.224829 -0.610046 -0.033485 0.035040
Campaign_Portuguese -0.067183 -0.140317 0.864737 0.740888 0.018468 1.000000 -0.173546 -0.283673 -0.126847 0.155188 0.157308 0.720007
Discrimination_Negative 0.419378 0.284962 -0.099670 0.171614 0.156060 -0.173546 1.000000 -0.236608 0.124414 -0.237682 -0.033660 -0.591547
Discrimination_Positive 0.188460 0.362658 -0.186004 -0.241764 0.107902 -0.283673 -0.236608 1.000000 -0.292206 -0.264764 -0.506955 -0.130878
Prevention_Negative -0.050735 -0.056928 -0.075539 -0.015288 0.224829 -0.126847 0.124414 -0.292206 1.000000 -0.007390 0.106317 -0.302603
Prevention_Neutral -0.409009 -0.332257 -0.086833 -0.182117 -0.610046 0.155188 -0.237682 -0.264764 -0.007390 1.000000 0.516719 0.307874
Prevention_Positive -0.039294 -0.131103 0.250205 0.134543 -0.033485 0.157308 -0.033660 -0.506955 0.106317 0.516719 1.000000 0.254708
Testing_Neutral -0.034574 -0.108993 0.624729 0.474609 0.035040 0.720007 -0.591547 -0.130878 -0.302603 0.307874 0.254708 1.000000
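As an aside, the positional indices [5, 13] used above to drop the two totals are brittle if columns are ever reordered; dropping them by name is equivalent (a sketch):

# Sketch: same result as df.drop(df.columns[[5, 13]], axis=1), but by name
dfNoTotals = df.drop(['Curitiba Total', 'Twitter Total'], axis=1)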

In [40]:
f, ax = plt.subplots(figsize=(12, 12))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [41]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr


Out[41]:
Curitiba Total Twitter Total
Curitiba Total 1.000000 0.296961
Twitter Total 0.296961 1.000000

In [42]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [43]:
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()
dfTotals["Curitiba Tests Total %"] = dfTotals["Curitiba Total"]/dfTotals["Curitiba Tests Total"]*100
dfTotals["Curitiba Tweets Total %"] = dfTotals["Twitter Total"]/dfTotals["Curitiba Tweets Total"]*100


/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
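The warnings appear because dfTotals is a slice of df, so pandas can't tell whether the assignments also modify df. One standard fix, shown here as a sketch, is to take an explicit copy of the slice first (the .loc indexing suggested in the warning works too):

# Sketch: an explicit copy of the slice silences SettingWithCopyWarning
dfTotals = df[["Curitiba Total", "Twitter Total"]].copy()
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()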

In [44]:
dfTotals


Out[44]:
Curitiba Total Twitter Total Curitiba Tests Total Curitiba Tweets Total Curitiba Tests Total % Curitiba Tweets Total %
2014-01-01 7848 627 110968 7546 7.072309 8.309038
2014-02-01 7532 612 110968 7546 6.787542 8.110257
2014-03-01 7543 651 110968 7546 6.797455 8.627087
2014-04-01 7657 594 110968 7546 6.900187 7.871720
2014-05-01 7803 462 110968 7546 7.031757 6.122449
2014-06-01 6571 505 110968 7546 5.921527 6.692287
2014-07-01 8180 521 110968 7546 7.371494 6.904320
2014-08-01 7533 470 110968 7546 6.788444 6.228465
2014-09-01 7580 514 110968 7546 6.830798 6.811556
2014-10-01 7813 457 110968 7546 7.040769 6.056189
2014-11-01 6190 386 110968 7546 5.578185 5.115293
2014-12-01 5501 430 110968 7546 4.957285 5.698383
2015-01-01 7405 464 110968 7546 6.673095 6.148953
2015-02-01 7026 435 110968 7546 6.331555 5.764644
2015-03-01 8786 418 110968 7546 7.917598 5.539359

In [45]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'Tests vs. Tweets', y_axis_label = 'Monthly %', x_axis_label = 'Month',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (dfTotals.index.min(),dfTotals.index.max()), y_range = (0,10),
    )
fig.line(dfTotals.index, dfTotals["Curitiba Tests Total %"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Curitiba Tests"),
fig.line(dfTotals.index, dfTotals["Curitiba Tweets Total %"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "Curitiba Tweets")
show(fig)



In [46]:
sns.jointplot("Curitiba Total", "Twitter Total", dfTotals, kind="reg", color="#404040");



In [96]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [97]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [98]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");


Anomaly Detection

We can only do anomaly detection on the tweets, because we don't have enough data points in our ground-truth data. As we have very fine temporal information for all tweets (down to the second), we can resample to hourly and daily aggregates. That should give us enough data for anomaly detection.

First we resample to daily aggregates.


In [55]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with daily totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()


Out[55]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 0 15 0 0 0 1 0 16
2014-01-02 0 16 0 0 0 0 0 16
2014-01-03 0 15 0 0 0 0 0 15
2014-01-04 0 13 0 0 2 4 0 19
2014-01-05 0 12 0 0 0 4 0 16
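Hourly aggregates, mentioned above, would follow the same pattern; a sketch using the same (pandas 0.16-era) resample signature:

# Sketch: hourly aggregates, analogous to the daily resample above
twitterDataHourlyAgg = twitterDataSmall.resample('H', how='sum')
twitterDataHourlyAgg['Twitter Total'] = twitterDataHourlyAgg.sum(axis=1)
twitterDataHourlyAgg.head()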

As we'll be using R instead of Python for this, we'll use IPython's built-in R interface via the so-called magic functions.


In [56]:
%load_ext rmagic


/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/extensions/rmagic.py:693: UserWarning: The rmagic extension in IPython is deprecated in favour of rpy2.ipython. If available, that will be loaded instead.
http://rpy.sourceforge.net/
  warnings.warn("The rmagic extension in IPython is deprecated in favour of "

%%R
update.packages()
install.packages("devtools")
devtools::install_github("twitter/AnomalyDetection")

In [57]:
%R library(AnomalyDetection)


Out[57]:
<StrVector - Python:0x7f6b1d9e5bd8 / R:0x3a06f68>
[str, str, str, ..., str, str, str]

In [58]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)

In [59]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')

In [60]:
!cat TwitterDailyAgg.csv | head


origdate,Twitter Total
2014-01-01,16.0
2014-01-02,16.0
2014-01-03,15.0
2014-01-04,19.0
2014-01-05,16.0
2014-01-06,21.0
2014-01-07,28.0
2014-01-08,14.0
2014-01-09,14.0

In [61]:
%%R

df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)

In [62]:
%R data(df_r)


Out[62]:
<StrVector - Python:0x7f6b1d9e5878 / R:0x5cf56d8>
[str]

In [63]:
%R df_r


Out[63]:
<DataFrame - Python:0x7f6b1d8fcef0 / R:0x4e64a38>
[StrVector, FloatVector]
  origdate: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7f6b1d9795f0 / R:0x5baeb60>
[str, str, str, ..., str, str, str]
  Twitter.Total: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f6b1d9797e8 / R:0x5baf9d0>
[16.000000, 16.000000, 15.000000, ..., 11.000000, 12.000000, 17.000000]

In [64]:
%%R
data(df_r)
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
In addition: Warning messages:
1: In data(df_r) : data set ‘df_r’ not found
2: In data(df_r) : data set ‘df_r’ not found
3: In max(ares) : no non-missing arguments to max; returning -Inf
4: In max(ares) : no non-missing arguments to max; returning -Inf
Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
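The failure above is likely because data() only loads datasets bundled with packages (hence the "df_r not found" warnings), and because AnomalyDetectionTs expects a POSIXct timestamp column while read.csv left origdate as character strings. As a pure-Python fallback, a crude rolling z-score can flag outlying days; this sketch is not the S-H-ESD algorithm that the Twitter package implements:

# Sketch: flag days that deviate strongly from a 30-day rolling mean
# (a crude stand-in for AnomalyDetectionTs, using pandas 0.16 rolling functions)
totals = twitterDataDailyAgg['Twitter Total']
roll_mean = pd.rolling_mean(totals, window=30, center=True)
roll_std = pd.rolling_std(totals, window=30, center=True)
z = (totals - roll_mean) / roll_std
totals[z.abs() > 3] # Candidate anomalies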

Sanity Check Using the Package's Example Data

To verify that the AnomalyDetection package itself works, we run it on raw_data, the example dataset bundled with the package.


In [51]:
%R raw_data


Out[51]:
array([ <DataFrame - Python:0x7f36222810e0 / R:0x6e78748>
[Float..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...]
  <no name>: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f3622281488 / R:0x785a320>
[0.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9ea8 / R:0x6611f80>
[       1,        2,        3, ...,       56,       57,       58]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e92d8 / R:0x7aae890>
[      14,       14,       14, ...,       13,       13,       13]
  ...
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e93f8 / R:0x6891040>
[       4,        4,        4, ...,        0,        0,        0]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9488 / R:0x76d8910>
[     268,      268,      268, ...,      278,      278,      278]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9248 / R:0x76e6a40>
[       0,        0,        0, ...,        0,        0,        0],
       <FloatVector - Python:0x7f36222811b8 / R:0x7a673c0>
[182.478000, 176.231000, 183.917000, ..., 153.776000, 150.481000, 146.638000]], dtype=object)

In [52]:
%%R

data(raw_data)
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Styling


In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:
